Fix attempting to combine Hangul Jamo 0x11a7 (#317)
author Diego Frias <mail@dzfrias.dev>
Sat, 22 Nov 2025 18:42:18 +0000 (12:42 -0600)
committer GitHub <noreply@github.com>
Sat, 22 Nov 2025 18:42:18 +0000 (13:42 -0500)
* Fix attempting to combine Hangul Jamo 0x11a7

0x11a7 is not a valid Hangul T syllable despite being equal to T_BASE.
This is because, per the Unicode spec:

  TCount is set to one more than the number of trailing consonants
  relevant to the decomposition algorithm: (0x11C2 - 0x11A8 + 1) + 1

So the first valid Hangul T syllable is 0x11a8. Also see
https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G59434
for where the spec describes the usage of 0x11a8, not 0x11a7, during
composition.

* document that utf8proc_map simply wraps utf8proc_decompose and utf8proc_reencode (#312)

* test code refactoring (#318)

* Write regression test for #317

---------

Co-authored-by: Steven G. Johnson <stevenj@alum.mit.edu>
test/misc.c
utf8proc.c

index bff793dea33e787407c111f936f652b0ace15578..7ea2ebcc0d3ea8d2d639dcaef3acd68d261e9964 100644 (file)
@@ -25,10 +25,30 @@ static void issue102(void) /* #102 */
     check_compare("NFKC_Casefold", input, correct, utf8proc_NFKC_Casefold(input), 1);
 }
 
+static void issue317(void) /* #317: NFC must not compose a Hangul syllable with U+11A7 */
+{
+    utf8proc_uint8_t input[] = {0xec, 0xa3, 0xa0, 0xe1, 0x86, 0xa7, 0x00}; /* "\uc8e0\u11a7" */
+    utf8proc_uint8_t combined[] = {0xec, 0xa3, 0xa1, 0x00}; /* "\uc8e1"; re-encoded on each loop iteration */
+    utf8proc_int32_t codepoint;
+
+    /* inputs that should *not* be combined: U+11A7 (== TBASE) and U+11C3 (one past the last T jamo) */
+    check_compare("NFC", input, input, utf8proc_NFC(input), 1);
+    utf8proc_encode_char(0x11c3, input+3);
+    check_compare("NFC", input, input, utf8proc_NFC(input), 1);
+
+    /* inputs that *should* be combined (TCOUNT-1 chars starting at TBASE+1) */
+    for (codepoint = 0x11a8; codepoint < 0x11c3; ++codepoint) {
+        utf8proc_encode_char(codepoint, input+3); /* replace the trailing jamo in place */
+        utf8proc_encode_char(0xc8e0 + (codepoint - 0x11a7), combined); /* expected precomposed syllable */
+        check_compare("NFC", input, combined, utf8proc_NFC(input), 1);
+    }
+}
+
 int main(void)
 {
     issue128();
     issue102();
+    issue317();
 #ifdef UNICODE_VERSION
     printf("Unicode version: Makefile has %s, has API %s\n", UNICODE_VERSION, utf8proc_unicode_version());
     check(!strcmp(UNICODE_VERSION, utf8proc_unicode_version()), "utf8proc_unicode_version mismatch");
index c59bad20f6096586fd2260dc738408463af2ceea..b9877c0aeed8500e69b57d7053534834975535c4 100644 (file)
@@ -684,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
             (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
           utf8proc_int32_t hangul_tindex;
           hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
-          if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
+          if (hangul_tindex > 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
             *starter += hangul_tindex;
             starter_property = NULL;
             continue;